{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Empirical evaluation of the impact of k-means initialization\n\n\nEvaluate the ability of k-means initializations strategies to make\nthe algorithm convergence robust as measured by the relative standard\ndeviation of the inertia of the clustering (i.e. the sum of squared\ndistances to the nearest cluster center).\n\nThe first plot shows the best inertia reached for each combination\nof the model (``KMeans`` or ``MiniBatchKMeans``) and the init method\n(``init=\"random\"`` or ``init=\"kmeans++\"``) for increasing values of the\n``n_init`` parameter that controls the number of initializations.\n\nThe second plot demonstrate one single run of the ``MiniBatchKMeans``\nestimator using a ``init=\"random\"`` and ``n_init=1``. This run leads to\na bad convergence (local optimum) with estimated centers stuck\nbetween ground truth clusters.\n\nThe dataset used for evaluation is a 2D grid of isotropic Gaussian\nclusters widely spaced.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Olivier Grisel <olivier.grisel@ensta.org>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport matplotlib.cm as cm\n\nfrom sklearn.utils import shuffle\nfrom sklearn.utils import check_random_state\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.cluster import KMeans\n\nrandom_state = np.random.RandomState(0)\n\n# Number of run (with randomly generated dataset) for each strategy so as\n# to be able to compute an estimate of the standard deviation\nn_runs = 5\n\n# k-means models can do several random inits so as to be able to trade\n# CPU time for convergence robustness\nn_init_range = np.array([1, 5, 10, 15, 20])\n\n# Datasets generation parameters\nn_samples_per_center = 100\ngrid_size = 3\nscale = 0.1\nn_clusters = grid_size ** 2\n\n\ndef make_data(random_state, n_samples_per_center, grid_size, scale):\n    random_state = check_random_state(random_state)\n    centers = np.array([[i, j]\n                        for i in range(grid_size)\n                        for j in range(grid_size)])\n    n_clusters_true, n_features = centers.shape\n\n    noise = random_state.normal(\n        scale=scale, size=(n_samples_per_center, centers.shape[1]))\n\n    X = np.concatenate([c + noise for c in centers])\n    y = np.concatenate([[i] * n_samples_per_center\n                        for i in range(n_clusters_true)])\n    return shuffle(X, y, random_state=random_state)\n\n# Part 1: Quantitative evaluation of various init methods\n\nplt.figure()\nplots = []\nlegends = []\n\ncases = [\n    (KMeans, 'k-means++', {}),\n    (KMeans, 'random', {}),\n    (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),\n    (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),\n]\n\nfor factory, init, params in cases:\n    print(\"Evaluation of %s with %s init\" % (factory.__name__, init))\n    inertia = np.empty((len(n_init_range), n_runs))\n\n    for run_id in range(n_runs):\n        X, y = make_data(run_id, n_samples_per_center, grid_size, scale)\n        for i, n_init in enumerate(n_init_range):\n            km = factory(n_clusters=n_clusters, init=init, random_state=run_id,\n                         n_init=n_init, **params).fit(X)\n            inertia[i, run_id] = km.inertia_\n    p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))\n    plots.append(p[0])\n    legends.append(\"%s with %s init\" % (factory.__name__, init))\n\nplt.xlabel('n_init')\nplt.ylabel('inertia')\nplt.legend(plots, legends)\nplt.title(\"Mean inertia for various k-means init across %d runs\" % n_runs)\n\n# Part 2: Qualitative visual inspection of the convergence\n\nX, y = make_data(random_state, n_samples_per_center, grid_size, scale)\nkm = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,\n                     random_state=random_state).fit(X)\n\nplt.figure()\nfor k in range(n_clusters):\n    my_members = km.labels_ == k\n    color = cm.nipy_spectral(float(k) / n_clusters, 1)\n    plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)\n    cluster_center = km.cluster_centers_[k]\n    plt.plot(cluster_center[0], cluster_center[1], 'o',\n             markerfacecolor=color, markeredgecolor='k', markersize=6)\n    plt.title(\"Example cluster allocation with a single random init\\n\"\n              \"with MiniBatchKMeans\")\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}